{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Kesalahan Tatabahasa with Tagging"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-info\">\n",
    "\n",
    "This tutorial is available as an IPython notebook at [Malaya/example/tatabahasa-tagging](https://github.com/huseinzol05/Malaya/tree/master/example/tatabahasa-tagging).\n",
    "    \n",
    "</div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-warning\">\n",
    "\n",
    "This module is trained only on standard language structure, so it is not safe to use it for local language structure.\n",
    "    \n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3397\n",
      "  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n",
      "/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3927\n",
      "  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n"
     ]
    }
   ],
   "source": [
    "import malaya\n",
    "from pprint import pprint"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### List available HuggingFace models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'mesolitica/finetune-tatabahasa-t5-tiny-standard-bahasa-cased': {'Size (MB)': 139,\n",
       "  'exactly-match': 0.7665198237,\n",
       "  'f1': 0.970908229,\n",
       "  'exactly-match-tags': 0.87404885863,\n",
       "  'f1-tags': 0.9878723587,\n",
       "  'Suggested length': 256},\n",
       " 'mesolitica/finetune-tatabahasa-t5-small-standard-bahasa-cased': {'Size (MB)': 242,\n",
       "  'exactly-match': 0.8145774929,\n",
       "  'f1': 0.9781278,\n",
       "  'exactly-match-tags': 0.89447336,\n",
       "  'f1-tags': 0.990597377,\n",
       "  'Suggested length': 256},\n",
       " 'mesolitica/finetune-tatabahasa-t5-base-standard-bahasa-cased': {'Size (MB)': 892,\n",
       "  'exactly-match': 0.760913095,\n",
       "  'f1': 0.970136249,\n",
       "  'exactly-match-tags': 0.865839,\n",
       "  'f1-tags': 0.9868999035,\n",
       "  'Suggested length': 256}}"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "malaya.tatabahasa.available_huggingface"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Supported kesalahan tatabahasa\n",
    "\n",
    "For full description, check out https://tatabahasabm.tripod.com/tata/salahtata.htm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'class': 0, 'Description': 'PAD', 'salah': '', 'betul': ''},\n",
       " {'class': 1, 'Description': 'kesambungan subwords', 'salah': '', 'betul': ''},\n",
       " {'class': 2, 'Description': 'tiada kesalahan', 'salah': '', 'betul': ''},\n",
       " {'class': 3,\n",
       "  'Description': 'kesalahan frasa nama, Perkara yang diterangkan mesti mendahului \"penerang\"',\n",
       "  'salah': 'Cili sos',\n",
       "  'betul': 'sos cili'},\n",
       " {'class': 4,\n",
       "  'Description': 'kesalahan kata jamak',\n",
       "  'salah': 'mereka-mereka',\n",
       "  'betul': 'mereka'},\n",
       " {'class': 5,\n",
       "  'Description': 'kesalahan kata penguat',\n",
       "  'salah': 'sangat tinggi sekali',\n",
       "  'betul': 'sangat tinggi'},\n",
       " {'class': 6,\n",
       "  'Description': 'kata adjektif dan imbuhan \"ter\" tanpa penguat.',\n",
       "  'salah': 'Sani mendapat markah yang tertinggi sekali.',\n",
       "  'betul': 'Sani mendapat markah yang tertinggi.'},\n",
       " {'class': 7,\n",
       "  'Description': 'kesalahan kata hubung',\n",
       "  'salah': 'Sally sedang membaca bila saya tiba di rumahnya.',\n",
       "  'betul': 'Sally sedang membaca apabila saya tiba di rumahnya.'},\n",
       " {'class': 8,\n",
       "  'Description': 'kesalahan kata bilangan',\n",
       "  'salah': 'Beribu peniaga tidak membayar cukai pendapatan.',\n",
       "  'betul': 'Beribu-ribu peniaga tidak membayar cukai pendapatan'},\n",
       " {'class': 9,\n",
       "  'Description': 'kesalahan kata sendi',\n",
       "  'salah': 'Umar telah berpindah daripada sekolah ini bulan lalu.',\n",
       "  'betul': 'Umar telah berpindah dari sekolah ini bulan lalu.'},\n",
       " {'class': 10,\n",
       "  'Description': 'kesalahan penjodoh bilangan',\n",
       "  'salah': 'Setiap orang pelajar',\n",
       "  'betul': 'Setiap pelajar.'},\n",
       " {'class': 11,\n",
       "  'Description': 'kesalahan kata ganti diri',\n",
       "  'salah': 'Pencuri itu telah ditangkap. Beliau dibawa ke balai polis.',\n",
       "  'betul': 'Pencuri itu telah ditangkap. Dia dibawa ke balai polis.'},\n",
       " {'class': 12,\n",
       "  'Description': 'kesalahan ayat pasif',\n",
       "  'salah': 'Cerpen itu telah dikarang oleh saya.',\n",
       "  'betul': 'Cerpen itu telah saya karang.'},\n",
       " {'class': 13,\n",
       "  'Description': 'kesalahan kata tanya',\n",
       "  'salah': 'Kamu berasal dari manakah ?',\n",
       "  'betul': 'Kamu berasal dari mana ?'},\n",
       " {'class': 14,\n",
       "  'Description': 'kesalahan tanda baca',\n",
       "  'salah': 'Kamu berasal dari manakah .',\n",
       "  'betul': 'Kamu berasal dari mana ?'},\n",
       " {'class': 15,\n",
       "  'Description': 'kesalahan kata kerja tak transitif',\n",
       "  'salah': 'Dia kata kepada saya',\n",
       "  'betul': 'Dia berkata kepada saya'},\n",
       " {'class': 16,\n",
       "  'Description': 'kesalahan kata kerja transitif',\n",
       "  'salah': 'Dia suka baca buku',\n",
       "  'betul': 'Dia suka membaca buku'},\n",
       " {'class': 17,\n",
       "  'Description': 'penggunaan kata yang tidak tepat',\n",
       "  'salah': 'Tembuk Besar negeri Cina dibina oleh Shih Huang Ti.',\n",
       "  'betul': 'Tembok Besar negeri Cina dibina oleh Shih Huang Ti'}]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "malaya.tatabahasa.describe"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Right now we are only able to predict up to 15 different kesalahan tatabahasa; hopefully we can scale this up in the future."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load HuggingFace model\n",
    "\n",
    "```python\n",
    "def huggingface(\n",
    "    model: str = 'mesolitica/finetune-tatabahasa-t5-small-standard-bahasa-cased',\n",
    "    force_check: bool = True,\n",
    "    **kwargs,\n",
    "):\n",
    "    \"\"\"\n",
    "    Load HuggingFace model to fix kesalahan tatabahasa.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    model: str, optional (default='mesolitica/finetune-tatabahasa-t5-small-standard-bahasa-cased')\n",
    "        Check available models at `malaya.tatabahasa.available_huggingface`.\n",
    "    force_check: bool, optional (default=True)\n",
    "        Force check that the model is one of the Malaya models.\n",
    "        Set to False if you have your own huggingface model.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    result: malaya.torch_model.huggingface.Tatabahasa\n",
    "    \"\"\"\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.\n",
      "You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565\n"
     ]
    }
   ],
   "source": [
    "model = malaya.tatabahasa.huggingface()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Predict\n",
    "\n",
    "```python\n",
    "def generate(\n",
    "    self,\n",
    "    strings: List[str],\n",
    "    **kwargs,\n",
    "):\n",
    "    \"\"\"\n",
    "    Fix kesalahan tatabahasa.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    strings : List[str]\n",
    "    **kwargs: keyword arguments passed to huggingface `generate` method.\n",
    "        Read more at https://huggingface.co/docs/transformers/main_classes/text_generation\n",
    "        Fix kesalahan tatabahasa supports all decoding methods except beam search.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    result: List[List[Tuple[str, int]]]\n",
    "    \"\"\"\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A randomly picked string from the Bahasa Melayu Wikipedia."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# https://ms.wikipedia.org/wiki/Bola_sepak\n",
    "string = 'Pada amnya, hanya penjaga gol sahaja yang dibenarkan menyentuh bola dengan tangan di dalam kawasan golnya'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[('Pada', 2),\n",
       "  ('amnya', 2),\n",
       "  (',', 2),\n",
       "  ('hanya', 2),\n",
       "  ('penjaga', 2),\n",
       "  ('gol', 2),\n",
       "  ('sahaja', 2),\n",
       "  ('yang', 2),\n",
       "  ('dibenarkan', 2),\n",
       "  ('menyentuh', 2),\n",
       "  ('bola', 2),\n",
       "  ('dengan', 2),\n",
       "  ('tangan', 2),\n",
       "  ('di', 2),\n",
       "  ('dalam', 2),\n",
       "  ('kawasan', 2),\n",
       "  ('golnya', 2)]]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.generate([string], max_length = 256)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now assume we have a kesalahan frasa nama, where `penjaga gol` becomes `gol penjaga`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# https://ms.wikipedia.org/wiki/Bola_sepak\n",
    "string = 'Pada amnya, hanya gol penjaga sahaja yang dibenarkan menyentuh bola dengan tangan di dalam kawasan golnya'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[('Pada', 2),\n",
       "  ('amnya', 2),\n",
       "  (',', 2),\n",
       "  ('hanya', 2),\n",
       "  ('penjaga', 3),\n",
       "  ('gol', 3),\n",
       "  ('sahaja', 2),\n",
       "  ('yang', 2),\n",
       "  ('dibenarkan', 2),\n",
       "  ('menyentuh', 2),\n",
       "  ('bola', 2),\n",
       "  ('dengan', 2),\n",
       "  ('tangan', 2),\n",
       "  ('di', 2),\n",
       "  ('dalam', 2),\n",
       "  ('kawasan', 2),\n",
       "  ('golnya', 2)]]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.generate([string], max_length = 256)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[('Sani', 2),\n",
       "  ('mendapat', 2),\n",
       "  ('markah', 2),\n",
       "  ('yang', 2),\n",
       "  ('tertinggi', 6),\n",
       "  ('.', 2)],\n",
       " [('Hassan', 2),\n",
       "  ('ialah', 2),\n",
       "  ('peserta', 2),\n",
       "  ('yang', 2),\n",
       "  ('termuda', 6),\n",
       "  ('dalam', 2),\n",
       "  ('pertandingan', 2),\n",
       "  ('itu', 2),\n",
       "  ('.', 2)]]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string = 'Sani mendapat markah yang tertinggi sekali.'\n",
    "string1 = 'Hassan ialah peserta yang termuda sekali dalam pertandingan itu.'\n",
    "model.generate([string, string1], max_length = 256)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[('Dia', 2), ('berkata', 15), ('kepada', 2), ('saya', 2), ('.', 2)]]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string = 'Dia kata kepada saya.'\n",
    "model.generate([string], max_length = 256)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "100"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pickle\n",
    "\n",
    "with open('tests/dataset-tatabahasa.pkl', 'rb') as fopen:\n",
    "    test_set = pickle.load(fopen)\n",
    "    \n",
    "len(test_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_xy(row):\n",
    "    x, y, tag = [], [], []\n",
    "\n",
    "    for i in range(len(row[0])):\n",
    "        t = [row[0][i][0]]\n",
    "        y.extend(t)\n",
    "        t = [row[1][i][0]]\n",
    "        x.extend(t)\n",
    "        tag.extend([row[1][i][1]] * len(t))\n",
    "\n",
    "    return ' '.join(x), ' '.join(y), tag"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('Dirk Jan Klaas \" Klaas-Jan \" Huntelaar ( lahir 12 Ogos 1983 ) merupakan pemain bola sepak Belanda yang bermain seperti posisi penyerang !',\n",
       " 'Dirk Jan Klaas \" Klaas-Jan \" Huntelaar ( lahir 12 Ogos 1983 ) merupakan pemain bola sepak Belanda yang bermain di posisi penyerang .',\n",
       " [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 9, 2, 2, 14])"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x, y, t = get_xy(test_set[0])\n",
    "x, y, t"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[('Dirk', 2),\n",
       "  ('Jan', 2),\n",
       "  ('Klaas', 2),\n",
       "  ('\"', 2),\n",
       "  ('Klaas-Jan', 2),\n",
       "  ('\"', 2),\n",
       "  ('Huntelaar', 2),\n",
       "  ('(', 2),\n",
       "  ('lahir', 2),\n",
       "  ('12', 2),\n",
       "  ('Ogos', 2),\n",
       "  ('1983', 2),\n",
       "  (')', 2),\n",
       "  ('merupakan', 2),\n",
       "  ('pemain', 2),\n",
       "  ('bola', 2),\n",
       "  ('sepak', 2),\n",
       "  ('Belanda', 2),\n",
       "  ('yang', 2),\n",
       "  ('bermain', 2),\n",
       "  ('di', 9),\n",
       "  ('posisi', 2),\n",
       "  ('penyerang', 2),\n",
       "  ('.', 14)]]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.generate([x], max_length = 256)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('Pada tahun 2002 , kedua-dua gol beliau menduduki tempat ke-6 dalam 100 Greatest Sporting Moments oleh saluran Channel 4 UK .',\n",
       " 'Pada tahun 2002 , kedua-dua gol ini menduduki tempat ke-6 dalam 100 Greatest Sporting Moments oleh saluran Channel 4 UK .',\n",
       " [2, 2, 2, 2, 2, 2, 11, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x, y, t = get_xy(test_set[-1])\n",
    "x, y, t"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[('Pada', 2),\n",
       "  ('tahun', 2),\n",
       "  ('2002', 2),\n",
       "  (',', 2),\n",
       "  ('kedua-dua', 2),\n",
       "  ('gol', 2),\n",
       "  ('beliau', 11),\n",
       "  ('menduduki', 2),\n",
       "  ('tempat', 2),\n",
       "  ('ke-6', 2),\n",
       "  ('dalam', 2),\n",
       "  ('100', 2),\n",
       "  ('Greatest', 2),\n",
       "  ('Sporting', 2),\n",
       "  ('Moments', 2),\n",
       "  ('oleh', 2),\n",
       "  ('saluran', 2),\n",
       "  ('Channel', 2),\n",
       "  ('4', 2),\n",
       "  ('UK', 2),\n",
       "  ('.', 2)]]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.generate([x], max_length = 256)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('Gol inilah yang bergelar Goal of the Century dengan undian Internet 2000 sejak FIFA .',\n",
       " 'Gol inilah yang bergelar Goal of the Century di undian Internet 2000 oleh FIFA .',\n",
       " [2, 2, 2, 2, 2, 2, 2, 2, 9, 2, 2, 2, 9, 2, 2])"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x, y, t = get_xy(test_set[-2])\n",
    "x, y, t"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[('Gol', 2),\n",
       "  ('inilah', 2),\n",
       "  ('yang', 2),\n",
       "  ('bergelar', 2),\n",
       "  ('Goal', 2),\n",
       "  ('of', 2),\n",
       "  ('the', 2),\n",
       "  ('Century', 2),\n",
       "  ('dalam', 9),\n",
       "  ('undian', 2),\n",
       "  ('Internet', 2),\n",
       "  ('2000', 2),\n",
       "  ('oleh', 9),\n",
       "  ('FIFA', 2),\n",
       "  ('.', 2)]]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.generate([x], max_length = 256)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('Beliau mengambil bola dalam kawasan kepul diri lalu pusing dan luru lebih separuh padang sambil menyentuh bola 11 kali , memintas lima pemain England : ( Glenn Hoddle , Peter Reid , Kenny Sansom , Terry Butcher , dan Terry Fenwick ) serta penjaga gawang Peter Shilton .',\n",
       " 'Beliau mengambil bola di kawasan pasukan diri lalu berpusing-pusing dan meluru lebih separuh padang sambil menyentuh bola 11 kali , memintas lima pemain England : ( Glenn Hoddle , Peter Reid , Kenny Sansom , Terry Butcher , dan Terry Fenwick ) serta penjaga gawang Peter Shilton .',\n",
       " [2,\n",
       "  2,\n",
       "  2,\n",
       "  9,\n",
       "  2,\n",
       "  10,\n",
       "  2,\n",
       "  2,\n",
       "  15,\n",
       "  2,\n",
       "  15,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2,\n",
       "  2])"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x, y, t = get_xy(test_set[-3])\n",
    "x, y, t"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[('Beliau', 2),\n",
       "  ('mengambil', 2),\n",
       "  ('bola', 2),\n",
       "  ('dalam', 2),\n",
       "  ('kawasan', 2),\n",
       "  ('pasukan', 10),\n",
       "  ('berdiri', 15),\n",
       "  ('lalu', 2),\n",
       "  ('berpusing', 15),\n",
       "  ('dan', 2),\n",
       "  ('meluru', 15),\n",
       "  ('lebih', 2),\n",
       "  ('separuh', 2),\n",
       "  ('padang', 2),\n",
       "  ('sambil', 2),\n",
       "  ('menyentuh', 2),\n",
       "  ('bola', 2),\n",
       "  ('11', 2),\n",
       "  ('kali', 2),\n",
       "  (',', 2),\n",
       "  ('memintas', 2),\n",
       "  ('lima', 2),\n",
       "  ('pemain', 2),\n",
       "  ('England', 2),\n",
       "  (':', 2),\n",
       "  ('(', 2),\n",
       "  ('Glenn', 2),\n",
       "  ('Hoddle', 2),\n",
       "  (',', 2),\n",
       "  ('Peter', 2),\n",
       "  ('Reid', 2),\n",
       "  (',', 2),\n",
       "  ('Kenny', 2),\n",
       "  ('Sansom', 2),\n",
       "  (',', 2),\n",
       "  ('Terry', 2),\n",
       "  ('Butcher', 2),\n",
       "  (',', 2),\n",
       "  ('dan', 2),\n",
       "  ('Terry', 2),\n",
       "  ('Fenwick', 2),\n",
       "  (')', 2),\n",
       "  ('serta', 2),\n",
       "  ('penjaga', 2),\n",
       "  ('gawang', 2),\n",
       "  ('Peter', 2),\n",
       "  ('Shilton', 2),\n",
       "  ('.', 2)]]"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.generate([x], max_length = 256)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### More examples\n",
    "\n",
    "These examples are copied from https://ms.wikipedia.org/wiki/Kesalahan_biasa_tatabahasa_Melayu"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[('Tidak', 2),\n",
       "  ('ada', 2),\n",
       "  ('apa', 2),\n",
       "  ('yang', 2),\n",
       "  ('mereka', 2),\n",
       "  ('risaukan', 2),\n",
       "  ('waktu', 2),\n",
       "  ('itu.', 2)]]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string = 'Tidak ada apa yang mereka risaukan waktu itu.'\n",
    "model.generate([string], max_length = 256)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[('Ayahnya', 2),\n",
       "  ('setuju', 2),\n",
       "  ('dan', 7),\n",
       "  ('melanggar', 2),\n",
       "  ('syarat', 2),\n",
       "  ('yang', 2),\n",
       "  ('dia', 2),\n",
       "  ('sendiri', 2),\n",
       "  ('menetapkan', 2),\n",
       "  ('.', 2)]]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string = 'Ayahnya setuju walaupun melanggar syarat yang dia sendiri menetapkan.'\n",
    "model.generate([string], max_length = 256)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[('Semuanya', 2), ('dia', 2), ('kenal', 2), ('.', 2)]]"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string = 'Semuanya dia kenal.'\n",
    "model.generate([string], max_length = 256)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[('Dia', 2),\n",
       "  ('menjawab', 2),\n",
       "  ('seperti', 2),\n",
       "  ('disuruh-suruh', 2),\n",
       "  ('oleh', 2),\n",
       "  ('kuasa', 2),\n",
       "  ('yang', 2),\n",
       "  ('dia', 2),\n",
       "  ('tidak', 2),\n",
       "  ('tahu', 2),\n",
       "  ('dari', 2),\n",
       "  ('mana', 2),\n",
       "  ('puncanya.', 2)]]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string = 'Dia menjawab seperti disuruh-suruh oleh kuasa yang dia tidak tahu dari mana puncanya.'\n",
    "model.generate([string], max_length = 256)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[('Bola', 2),\n",
       "  ('ini', 2),\n",
       "  ('ditendang', 2),\n",
       "  ('oleh', 2),\n",
       "  ('saya', 2),\n",
       "  ('.', 2)]]"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string = 'Bola ini ditendang oleh saya.'\n",
    "model.generate([string], max_length = 256)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[('Makanan', 2),\n",
       "  ('ini', 11),\n",
       "  ('kamu', 2),\n",
       "  ('telah', 2),\n",
       "  ('makan', 2),\n",
       "  ('.', 14)]]"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string = 'Makanan ini kamu telah makan?'\n",
    "model.generate([string], max_length = 256)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[('Segala', 2),\n",
       "  ('perubahan', 2),\n",
       "  ('yang', 2),\n",
       "  ('berlaku', 2),\n",
       "  ('kita', 2),\n",
       "  ('akan', 2),\n",
       "  ('menghadapi', 2),\n",
       "  ('sama-sama', 2),\n",
       "  ('.', 2)]]"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string = 'Segala perubahan yang berlaku kita akan menghadapi sama-sama.'\n",
    "model.generate([string], max_length = 256)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[('Kakak', 2),\n",
       "  ('saya', 2),\n",
       "  ('sedang', 2),\n",
       "  ('memasak', 2),\n",
       "  ('gulai', 2),\n",
       "  ('nangka.', 2),\n",
       "  ('Dia', 2),\n",
       "  ('menyenduk', 2),\n",
       "  ('seketul', 2),\n",
       "  ('gulai', 3),\n",
       "  ('nangka', 3),\n",
       "  ('dan', 2),\n",
       "  ('menyuruh', 2),\n",
       "  ('saya', 2),\n",
       "  ('merasanya.', 2)]]"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string = 'Kakak saya sedang memasak gulai nangka. Dia menyenduk seketul nangka gulai dan menyuruh saya merasanya.'\n",
    "model.generate([string], max_length = 256)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[('Sally', 2),\n",
       "  ('sedang', 2),\n",
       "  ('membaca', 2),\n",
       "  ('bila', 2),\n",
       "  ('saya', 11),\n",
       "  ('tiba', 2),\n",
       "  ('di', 2),\n",
       "  ('rumahnya.', 2)]]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string = 'Sally sedang membaca bila saya tiba di rumahnya.'\n",
    "model.generate([string], max_length = 256)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[('Badannya', 2),\n",
       "  ('besar', 2),\n",
       "  ('dan', 7),\n",
       "  ('kakinya', 2),\n",
       "  ('kecil', 2),\n",
       "  ('.', 2)]]"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string = 'Badannya besar kecuali kakinya kecil.'\n",
    "model.generate([string], max_length = 256)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[('Beribu', 2),\n",
       "  ('peniaga', 2),\n",
       "  ('tidak', 2),\n",
       "  ('membayar', 2),\n",
       "  ('cukai', 2),\n",
       "  ('pendapatan.', 2)]]"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string = 'Beribu peniaga tidak membayar cukai pendapatan.'\n",
    "model.generate([string], max_length = 256)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[('Setengah', 2),\n",
       "  ('remaja', 2),\n",
       "  ('suka', 2),\n",
       "  ('membuang', 2),\n",
       "  ('masa', 2),\n",
       "  ('di', 2),\n",
       "  ('pasar', 2),\n",
       "  ('raya.', 2)]]"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string = 'Setengah remaja suka membuang masa di pasar raya.'\n",
    "model.generate([string], max_length = 256)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[('Umar', 2),\n",
       "  ('telah', 2),\n",
       "  ('berpindah', 2),\n",
       "  ('ke', 9),\n",
       "  ('sekolah', 2),\n",
       "  ('ini', 2),\n",
       "  ('bulan', 2),\n",
       "  ('lalu', 2),\n",
       "  ('.', 2)]]"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string = 'Umar telah berpindah daripada sekolah ini bulan lalu.'\n",
    "model.generate([string], max_length = 256)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[('Para', 4), ('peserta', 2), ('sedang', 2), ('berbaris', 2), ('.', 2)]]"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string = 'Para-para peserta sedang berbaris.'\n",
    "model.generate([string], max_length = 256)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
